In [1]:
# Silence library warnings so the notebook output stays readable.
import warnings

warnings.filterwarnings('ignore')

import pandas as pd

# Load the area-weighted all-India monthly/seasonal/annual rainfall
# series (1901-2015); columns: REGION, YEAR, JAN..DEC, ANNUAL, and
# four seasonal aggregates (Jan-Feb, Mar-May, Jun-Sep, Oct-Dec).
rainfall_data = pd.read_csv("rainfall_area-wt_India_1901-2015.csv")
print(rainfall_data.head())
REGION YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP \
0 INDIA 1901 34.7 37.7 18.0 39.3 50.8 113.4 242.2 272.9 124.4
1 INDIA 1902 7.4 4.3 19.0 43.5 48.3 108.8 284.0 199.7 201.5
2 INDIA 1903 17.0 8.3 31.3 17.1 59.5 118.3 297.0 270.4 199.1
3 INDIA 1904 14.4 9.6 31.8 33.1 72.4 164.8 261.0 206.4 129.6
4 INDIA 1905 25.3 20.9 42.7 33.7 55.7 93.3 252.8 200.8 178.4
OCT NOV DEC ANNUAL Jan-Feb Mar-May Jun-Sep Oct-Dec
0 52.7 38.0 8.3 1032.3 72.4 108.1 752.8 99.0
1 61.5 27.9 24.4 1030.2 11.7 110.8 794.0 113.8
2 117.9 36.9 17.7 1190.5 25.3 107.9 884.8 172.5
3 69.0 11.2 16.3 1019.8 24.0 137.4 761.8 96.6
4 51.4 9.7 10.5 975.3 46.2 132.2 725.4 71.6
In [2]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
import numpy as np

# ---- Annual rainfall trend analysis ----
# Work on a copy so the source frame stays untouched.
yearly_data = rainfall_data[['YEAR', 'ANNUAL']].copy()
# Centered 10-year moving average to expose the long-term trend.
yearly_data['rolling_avg'] = yearly_data['ANNUAL'].rolling(window=10, center=True).mean()

fig_trends = go.Figure()
# Raw annual series (thin, semi-transparent so overlays stand out).
fig_trends.add_trace(go.Scatter(
    x=yearly_data['YEAR'],
    y=yearly_data['ANNUAL'],
    mode='lines+markers',
    name='Annual Rainfall',
    line=dict(color='steelblue', width=1.5),
    marker=dict(size=3, opacity=0.6),
    opacity=0.8
))
# 10-year moving average overlay.
fig_trends.add_trace(go.Scatter(
    x=yearly_data['YEAR'],
    y=yearly_data['rolling_avg'],
    mode='lines',
    name='10-Year Moving Average',
    line=dict(color='darkred', width=3)
))
# Horizontal reference line at the overall mean.
mean_rainfall = yearly_data['ANNUAL'].mean()
fig_trends.add_trace(go.Scatter(
    x=yearly_data['YEAR'],
    y=[mean_rainfall] * len(yearly_data),
    mode='lines',
    name=f'Overall Mean ({mean_rainfall:.1f}mm)',
    line=dict(color='orange', dash='dot', width=2)
))
fig_trends.update_layout(
    title='Annual Rainfall Patterns in India with Trend Analysis (1901-2015)',
    xaxis_title='Year',
    yaxis_title='Rainfall (mm)',
    template='plotly_white',
    hovermode='x unified',
    width=1000,
    height=550,
    showlegend=True
)
fig_trends.show()

# ---- Monthly rainfall analysis with seasonal highlighting ----
month_names = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
monthly_stats = rainfall_data[month_names].mean()
# One bar color per month, grouped by season
# (winter / pre-monsoon / monsoon / post-monsoon).
season_colors = ['lightblue', 'lightblue', 'lightgreen', 'lightgreen', 'lightgreen',
                 'cyan', 'cyan', 'cyan', 'cyan', 'orange', 'orange', 'lightblue']
# Peak and minimum months, used in the title annotation below.
peak_month = monthly_stats.idxmax()
dry_month = monthly_stats.idxmin()
fig_monthly_enhanced = go.Figure()
fig_monthly_enhanced.add_trace(go.Bar(
    x=monthly_stats.index,
    y=monthly_stats.values,
    marker_color=season_colors,
    marker_line_color='green',
    marker_line_width=1.5,
    text=[f'{val:.0f}' for val in monthly_stats.values],
    textposition='outside',
    name='Monthly Rainfall'
))
# Dashed reference line at the all-month average.
fig_monthly_enhanced.add_hline(
    y=monthly_stats.mean(),
    line_dash="dash",
    line_color="red",
    line_width=2,
    annotation_text=f"Monthly Average: {monthly_stats.mean():.0f}mm",
    annotation_position="top left"
)
fig_monthly_enhanced.update_layout(
    title=f'Monthly Rainfall Distribution - Peak: {peak_month} ({monthly_stats[peak_month]:.0f}mm), Lowest: {dry_month} ({monthly_stats[dry_month]:.0f}mm)',
    xaxis_title='Month',
    yaxis_title='Average Rainfall (mm)',
    template='plotly_white',
    width=1000,
    height=550,
    showlegend=False
)
fig_monthly_enhanced.show()

# ---- Seasonal analysis with comparative insights ----
seasons = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
season_labels = ['Winter', 'Pre-Monsoon', 'Monsoon', 'Post-Monsoon']
seasonal_data = rainfall_data[seasons].mean()
# Percentage contribution of each season to the seasonal total.
total_seasonal = seasonal_data.sum()
seasonal_percentages = (seasonal_data / total_seasonal * 100).round(1)
# FIX: removed unused `fig_seasonal_pie = go.Figure()` — it was created
# but never populated or shown (dead object).
fig_seasonal_enhanced = go.Figure()
fig_seasonal_enhanced.add_trace(go.Bar(
    x=season_labels,
    y=seasonal_data.values,
    marker_color=['lightsteelblue', 'gold', 'forestgreen', 'darkorange'],
    marker_line_color='black',
    marker_line_width=2,
    text=[f'{val:.0f}mm<br>({pct}%)' for val, pct in zip(seasonal_data.values, seasonal_percentages)],
    textposition='outside',
    name='Seasonal Rainfall'
))
fig_seasonal_enhanced.update_layout(
    # .iloc[2] is the Monsoon (Jun-Sep) entry of the seasonal series.
    title=f'Seasonal Rainfall Distribution with Percentage Contribution<br><sub> - Monsoon dominates with {seasonal_percentages.iloc[2]:.1f}% of annual rainfall</sub>',
    xaxis_title='Season',
    yaxis_title='Average Rainfall (mm)',
    template='plotly_white',
    height=550,
    width=1000,
    showlegend=False,
)
fig_seasonal_enhanced.show()

# ---- Text summary of the headline numbers ----
print(f"\n📊 RAINFALL ANALYSIS SUMMARY")
print(f"{'='*50}")
print(f"Peak Rainfall Month: {peak_month} ({monthly_stats[peak_month]:.0f}mm)")
print(f"Driest Month: {dry_month} ({monthly_stats[dry_month]:.0f}mm)")
print(f"Annual Average: {mean_rainfall:.0f}mm")
print(f"Monsoon Season Contribution: {seasonal_percentages.iloc[2]:.1f}% of total rainfall")
📊 RAINFALL ANALYSIS SUMMARY ================================================== Peak Rainfall Month: JUL (291mm) Driest Month: DEC (15mm) Annual Average: 1182mm Monsoon Season Contribution: 75.3% of total rainfall
In [3]:
from scipy.stats import pearsonr, spearmanr
import pandas as pd
import numpy as np
from prettytable import PrettyTable

# ---- Drought / extreme-rainfall analysis via standard-deviation bands ----
annual_mean = rainfall_data['ANNUAL'].mean()
annual_std = rainfall_data['ANNUAL'].std()
annual_median = rainfall_data['ANNUAL'].median()

# Severity thresholds at 1.0 / 1.5 / 2.0 sigma on either side of the mean.
moderate_drought_threshold = annual_mean - 1.0 * annual_std
severe_drought_threshold = annual_mean - 1.5 * annual_std
extreme_drought_threshold = annual_mean - 2.0 * annual_std
moderate_excess_threshold = annual_mean + 1.0 * annual_std
severe_excess_threshold = annual_mean + 1.5 * annual_std
extreme_excess_threshold = annual_mean + 2.0 * annual_std

def classify_rainfall_year(annual_rainfall):
    """Map an annual rainfall total (mm) to a severity category.

    Checks the most extreme bands first; anything within +/-1 sigma of
    the mean falls through to 'Normal'. Uses the module-level thresholds
    computed above.
    """
    if annual_rainfall <= extreme_drought_threshold:
        return 'Extreme Drought'
    elif annual_rainfall <= severe_drought_threshold:
        return 'Severe Drought'
    elif annual_rainfall <= moderate_drought_threshold:
        return 'Moderate Drought'
    elif annual_rainfall >= extreme_excess_threshold:
        return 'Extreme Excess'
    elif annual_rainfall >= severe_excess_threshold:
        return 'Severe Excess'
    elif annual_rainfall >= moderate_excess_threshold:
        return 'Moderate Excess'
    else:
        return 'Normal'

# Per-year category, z-score (Anomaly_Score), and percentile rank.
rainfall_data_enhanced = rainfall_data.copy()
rainfall_data_enhanced['Rainfall_Category'] = rainfall_data_enhanced['ANNUAL'].apply(classify_rainfall_year)
rainfall_data_enhanced['Anomaly_Score'] = (rainfall_data_enhanced['ANNUAL'] - annual_mean) / annual_std
rainfall_data_enhanced['Percentile_Rank'] = rainfall_data_enhanced['ANNUAL'].rank(pct=True) * 100

# Keep only years beyond the +/-1.5 sigma bands.
drought_years_enhanced = rainfall_data_enhanced[
    rainfall_data_enhanced['ANNUAL'] <= severe_drought_threshold
].copy()
excess_years_enhanced = rainfall_data_enhanced[
    rainfall_data_enhanced['ANNUAL'] >= severe_excess_threshold
].copy()

# Season name -> aggregate column in the source frame.
seasonal_periods = {
    'Winter': 'Jan-Feb',
    'Pre_Monsoon': 'Mar-May',
    'Monsoon': 'Jun-Sep',
    'Post_Monsoon': 'Oct-Dec'
}
# Pearson (linear) and Spearman (rank) correlation of each season vs. the
# annual total; significance judged on the Pearson p-value at 0.05.
correlation_analysis = {}
for season_name, season_col in seasonal_periods.items():
    pearson_corr, pearson_p = pearsonr(rainfall_data[season_col], rainfall_data['ANNUAL'])
    spearman_corr, spearman_p = spearmanr(rainfall_data[season_col], rainfall_data['ANNUAL'])
    correlation_analysis[season_name] = {
        'Pearson_Correlation': pearson_corr,
        'Pearson_P_Value': pearson_p,
        'Spearman_Correlation': spearman_corr,
        'Spearman_P_Value': spearman_p,
        'Significance': 'Significant' if pearson_p < 0.05 else 'Not Significant'
    }

# Summary frames: droughts sorted driest-first, excess wettest-first.
drought_summary_enhanced = drought_years_enhanced[
    ['YEAR', 'ANNUAL', 'Rainfall_Category', 'Anomaly_Score', 'Percentile_Rank']
].sort_values('ANNUAL').reset_index(drop=True)
excess_summary_enhanced = excess_years_enhanced[
    ['YEAR', 'ANNUAL', 'Rainfall_Category', 'Anomaly_Score', 'Percentile_Rank']
].sort_values('ANNUAL', ascending=False).reset_index(drop=True)
correlation_summary_enhanced = pd.DataFrame.from_dict(correlation_analysis, orient='index')

# Headline statistics for the overall summary table.
rainfall_statistics = {
    'Total_Years_Analyzed': len(rainfall_data),
    'Mean_Annual_Rainfall': annual_mean,
    'Standard_Deviation': annual_std,
    'Median_Rainfall': annual_median,
    'Drought_Years_Count': len(drought_years_enhanced),
    'Excess_Years_Count': len(excess_years_enhanced),
    'Drought_Frequency_Percent': (len(drought_years_enhanced) / len(rainfall_data)) * 100,
    'Excess_Frequency_Percent': (len(excess_years_enhanced) / len(rainfall_data)) * 100
}
statistics_summary = pd.DataFrame.from_dict(rainfall_statistics, orient='index', columns=['Value'])

def build_extreme_year_table(summary_df):
    """Build a PrettyTable of extreme years.

    Shared by the drought and excess listings (the original duplicated
    this block verbatim). Expects YEAR, ANNUAL, Rainfall_Category,
    Anomaly_Score and Percentile_Rank columns.
    """
    table = PrettyTable()
    table.field_names = ["Year", "Rainfall (mm)", "Category", "Anomaly Score", "Percentile Rank"]
    table.align["Year"] = "c"
    table.align["Rainfall (mm)"] = "r"
    table.align["Category"] = "l"
    table.align["Anomaly Score"] = "r"
    table.align["Percentile Rank"] = "r"
    for _, row in summary_df.iterrows():
        table.add_row([
            int(row['YEAR']),
            f"{row['ANNUAL']:.1f}",
            row['Rainfall_Category'],
            f"{row['Anomaly_Score']:.2f}",
            f"{row['Percentile_Rank']:.1f}%"
        ])
    return table

print("🌧️ ENHANCED RAINFALL ANALYSIS RESULTS")
print("=" * 60)

# Drought years table.
print(f"\n📉 DROUGHT YEARS (≤ {severe_drought_threshold:.1f}mm)")
print("-" * 60)
drought_table = build_extreme_year_table(drought_summary_enhanced)
print(drought_table)

# Excess rainfall years table.
print(f"\n📈 EXCESS RAINFALL YEARS (≥ {severe_excess_threshold:.1f}mm)")
print("-" * 60)
excess_table = build_extreme_year_table(excess_summary_enhanced)
print(excess_table)

# Seasonal correlation table.
print(f"\n🔄 SEASONAL CORRELATION ANALYSIS")
print("-" * 60)
correlation_table = PrettyTable()
correlation_table.field_names = ["Season", "Pearson Corr", "Pearson P-Val", "Spearman Corr", "Spearman P-Val", "Significance"]
# FIX: the original set align for "P-Value" — a field that does not exist
# in this table — and left both real P-Val columns unaligned.
correlation_table.align["Season"] = "l"
correlation_table.align["Pearson Corr"] = "r"
correlation_table.align["Pearson P-Val"] = "r"
correlation_table.align["Spearman Corr"] = "r"
correlation_table.align["Spearman P-Val"] = "r"
correlation_table.align["Significance"] = "c"
for season, data in correlation_analysis.items():
    correlation_table.add_row([
        season,
        f"{data['Pearson_Correlation']:.4f}",
        f"{data['Pearson_P_Value']:.4f}",
        f"{data['Spearman_Correlation']:.4f}",
        f"{data['Spearman_P_Value']:.4f}",
        data['Significance']
    ])
print(correlation_table)

# Overall statistics table.
print(f"\n📊 OVERALL STATISTICS SUMMARY")
print("-" * 60)
stats_table = PrettyTable()
stats_table.field_names = ["Statistic", "Value"]
stats_table.align["Statistic"] = "l"
stats_table.align["Value"] = "r"
# Human-readable formatting (units, percentages) for each statistic.
formatted_stats = {
    'Total Years Analyzed': f"{rainfall_statistics['Total_Years_Analyzed']}",
    'Mean Annual Rainfall': f"{rainfall_statistics['Mean_Annual_Rainfall']:.1f} mm",
    'Standard Deviation': f"{rainfall_statistics['Standard_Deviation']:.1f} mm",
    'Median Rainfall': f"{rainfall_statistics['Median_Rainfall']:.1f} mm",
    'Drought Years Count': f"{rainfall_statistics['Drought_Years_Count']}",
    'Excess Years Count': f"{rainfall_statistics['Excess_Years_Count']}",
    'Drought Frequency': f"{rainfall_statistics['Drought_Frequency_Percent']:.1f}%",
    'Excess Frequency': f"{rainfall_statistics['Excess_Frequency_Percent']:.1f}%"
}
for stat, value in formatted_stats.items():
    stats_table.add_row([stat, value])
print(stats_table)

# Key insights table.
print(f"\n💡 KEY INSIGHTS")
print("-" * 60)
insights_table = PrettyTable()
insights_table.field_names = ["Insight", "Description"]
insights_table.align["Insight"] = "l"
insights_table.align["Description"] = "l"
insights_table.max_width["Description"] = 50
# Season with the strongest absolute Pearson correlation to the annual total.
most_correlated_season = max(correlation_analysis.keys(),
                             key=lambda x: abs(correlation_analysis[x]['Pearson_Correlation']))
insights_table.add_row([
    "Driest Year",
    f"{int(drought_summary_enhanced.iloc[0]['YEAR'])} ({drought_summary_enhanced.iloc[0]['ANNUAL']:.1f}mm)"
])
insights_table.add_row([
    "Wettest Year",
    f"{int(excess_summary_enhanced.iloc[0]['YEAR'])} ({excess_summary_enhanced.iloc[0]['ANNUAL']:.1f}mm)"
])
insights_table.add_row([
    "Most Influential Season",
    f"{most_correlated_season} (r={correlation_analysis[most_correlated_season]['Pearson_Correlation']:.3f})"
])
insights_table.add_row([
    "Extreme Event Frequency",
    f"{(len(drought_years_enhanced) + len(excess_years_enhanced))} years ({((len(drought_years_enhanced) + len(excess_years_enhanced))/len(rainfall_data) * 100):.1f}%)"
])
print(insights_table)
🌧️ ENHANCED RAINFALL ANALYSIS RESULTS ============================================================ 📉 DROUGHT YEARS (≤ 1016.0mm) ------------------------------------------------------------ +------+---------------+-----------------+---------------+-----------------+ | Year | Rainfall (mm) | Category | Anomaly Score | Percentile Rank | +------+---------------+-----------------+---------------+-----------------+ | 2002 | 920.8 | Extreme Drought | -2.36 | 0.9% | | 1965 | 938.4 | Extreme Drought | -2.20 | 1.7% | | 1972 | 948.5 | Extreme Drought | -2.11 | 2.6% | | 2009 | 959.3 | Extreme Drought | -2.01 | 3.5% | | 1905 | 975.3 | Severe Drought | -1.87 | 4.3% | +------+---------------+-----------------+---------------+-----------------+ 📈 EXCESS RAINFALL YEARS (≥ 1348.1mm) ------------------------------------------------------------ +------+---------------+----------------+---------------+-----------------+ | Year | Rainfall (mm) | Category | Anomaly Score | Percentile Rank | +------+---------------+----------------+---------------+-----------------+ | 1917 | 1480.3 | Extreme Excess | 2.69 | 100.0% | | 1961 | 1403.0 | Severe Excess | 2.00 | 99.1% | | 1990 | 1400.6 | Severe Excess | 1.97 | 98.3% | | 1933 | 1393.5 | Severe Excess | 1.91 | 97.4% | | 1956 | 1386.2 | Severe Excess | 1.84 | 96.5% | | 1959 | 1382.1 | Severe Excess | 1.81 | 95.7% | | 1988 | 1351.0 | Severe Excess | 1.53 | 94.8% | +------+---------------+----------------+---------------+-----------------+ 🔄 SEASONAL CORRELATION ANALYSIS ------------------------------------------------------------ +--------------+--------------+---------------+---------------+----------------+--------------+ | Season | Pearson Corr | Pearson P-Val | Spearman Corr | Spearman P-Val | Significance | +--------------+--------------+---------------+---------------+----------------+--------------+ | Winter | 0.2289 | 0.0139 | 0.2301 | 0.0134 | Significant | | Pre_Monsoon | 0.3131 | 0.0007 | 0.2897 | 0.0017 | Significant | | Monsoon | 
0.9300 | 0.0000 | 0.9124 | 0.0000 | Significant | | Post_Monsoon | 0.5316 | 0.0000 | 0.4755 | 0.0000 | Significant | +--------------+--------------+---------------+---------------+----------------+--------------+ 📊 OVERALL STATISTICS SUMMARY ------------------------------------------------------------ +----------------------+-----------+ | Statistic | Value | +----------------------+-----------+ | Total Years Analyzed | 115 | | Mean Annual Rainfall | 1182.0 mm | | Standard Deviation | 110.7 mm | | Median Rainfall | 1190.5 mm | | Drought Years Count | 5 | | Excess Years Count | 7 | | Drought Frequency | 4.3% | | Excess Frequency | 6.1% | +----------------------+-----------+ 💡 KEY INSIGHTS ------------------------------------------------------------ +-------------------------+-------------------+ | Insight | Description | +-------------------------+-------------------+ | Driest Year | 2002 (920.8mm) | | Wettest Year | 1917 (1480.3mm) | | Most Influential Season | Monsoon (r=0.930) | | Extreme Event Frequency | 12 years (10.4%) | +-------------------------+-------------------+
In [4]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots

# ---- Monthly anomaly detection (z-score vs. the 1901-2015 climatology) ----
monthly_columns = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
# Long-term per-month means and standard deviations.
monthly_means = rainfall_data[monthly_columns].mean()
monthly_stds = rainfall_data[monthly_columns].std()
# A month is anomalous when it lies at least this many sigma from its mean.
anomaly_threshold = 1.5

anomaly_data = rainfall_data.copy()
# Add per-month z-score and boolean anomaly-flag columns.
for month in monthly_columns:
    anomaly_data[f'{month}_anomaly'] = (anomaly_data[month] - monthly_means[month]) / monthly_stds[month]
    anomaly_data[f'{month}_is_anomaly'] = np.abs(anomaly_data[f'{month}_anomaly']) >= anomaly_threshold

# 4x3 grid: one scatter panel per month, anomalous years in red.
fig_monthly_anomalies = make_subplots(
    rows=4, cols=3,
    subplot_titles=monthly_columns,
    shared_yaxes=True,
    vertical_spacing=0.08,
    horizontal_spacing=0.06
)
normal_color = 'lightblue'
anomaly_color = 'red'
for i, month in enumerate(monthly_columns):
    row = (i // 3) + 1
    col = (i % 3) + 1
    # Color each year's point by its anomaly flag.
    colors = [anomaly_color if is_anomaly else normal_color
              for is_anomaly in anomaly_data[f'{month}_is_anomaly']]
    fig_monthly_anomalies.add_trace(
        go.Scatter(
            x=anomaly_data['YEAR'],
            y=anomaly_data[month],
            mode='markers',
            marker=dict(
                color=colors,
                size=6,
                line=dict(width=1, color='darkblue')
            ),
            name=month,
            showlegend=False,
            hovertemplate=f'<b>{month}</b><br>' +
                          'Year: %{x}<br>' +
                          'Rainfall: %{y:.1f}mm<br>' +
                          '<extra></extra>'
        ),
        row=row, col=col
    )
    # Dashed green line at the month's long-term mean.
    fig_monthly_anomalies.add_hline(
        y=monthly_means[month],
        line_dash="dash",
        line_color="green",
        line_width=1,
        row=row, col=col
    )
    # Dotted orange lines at the upper/lower anomaly band edges.
    upper_threshold = monthly_means[month] + anomaly_threshold * monthly_stds[month]
    lower_threshold = monthly_means[month] - anomaly_threshold * monthly_stds[month]
    fig_monthly_anomalies.add_hline(
        y=upper_threshold,
        line_dash="dot",
        line_color="orange",
        line_width=1,
        row=row, col=col
    )
    fig_monthly_anomalies.add_hline(
        y=lower_threshold,
        line_dash="dot",
        line_color="orange",
        line_width=1,
        row=row, col=col
    )
fig_monthly_anomalies.update_layout(
    title='Monthly Rainfall Anomalies Across All Years (1901-2015)<br><sub>Red dots indicate anomalies (>1.5σ from mean)</sub>',
    height=1800,
    width=1500,
    template='plotly_white'
)
fig_monthly_anomalies.update_xaxes(title_text="Year")
fig_monthly_anomalies.update_yaxes(title_text="Rainfall (mm)")
fig_monthly_anomalies.show()

# Box plot of anomaly magnitudes (z-scores) per month, anomalous years only.
fig_boxplot = go.Figure()
for month in monthly_columns:
    anomaly_values = anomaly_data[anomaly_data[f'{month}_is_anomaly']][f'{month}_anomaly']
    if len(anomaly_values) > 0:
        fig_boxplot.add_trace(go.Box(
            y=anomaly_values,
            name=month,
            boxpoints='all',
            pointpos=0,
            marker=dict(color=anomaly_color, size=4),
            line=dict(color='darkred'),
            fillcolor='rgba(255,0,0,0.3)'
        ))
fig_boxplot.update_layout(
    title='Distribution of Anomaly Magnitudes by Month<br><sub>Statistical spread of extreme events - reveals which months show most variable anomaly patterns</sub>',
    xaxis_title='Month',
    yaxis_title='Anomaly Score (σ)',
    height=600,
    width=1200,
    template='plotly_white'
)
fig_boxplot.show()

# Per-month anomaly counts and the single most extreme year per month.
anomaly_counts = {}
extreme_years = {}
for month in monthly_columns:
    anomaly_years = anomaly_data[anomaly_data[f'{month}_is_anomaly']]
    anomaly_counts[month] = len(anomaly_years)
    if len(anomaly_years) > 0:
        # Most extreme year = largest |z-score| for this month.
        max_anomaly_idx = np.abs(anomaly_years[f'{month}_anomaly']).idxmax()
        extreme_years[month] = {
            'year': int(anomaly_years.loc[max_anomaly_idx, 'YEAR']),
            'rainfall': anomaly_years.loc[max_anomaly_idx, month],
            'anomaly_score': anomaly_years.loc[max_anomaly_idx, f'{month}_anomaly']
        }

# Bar chart of anomaly frequency per month.
fig_anomaly_freq = go.Figure()
fig_anomaly_freq.add_trace(go.Bar(
    x=monthly_columns,
    y=[anomaly_counts[month] for month in monthly_columns],
    marker_color=['red' if anomaly_counts[month] > 0 else 'lightgray' for month in monthly_columns],
    marker_line_color='darkred',
    marker_line_width=1.5,
    text=[anomaly_counts[month] for month in monthly_columns],
    textposition='outside'
))
fig_anomaly_freq.update_layout(
    title='Frequency of Monthly Rainfall Anomalies (1901-2015)<br><sub>Number of years each month experienced anomalous rainfall</sub>',
    xaxis_title='Month',
    yaxis_title='Number of Anomalous Years',
    template='plotly_white',
    height=500,
    width=1000,
    showlegend=False
)
fig_anomaly_freq.show()

# Text summary of anomaly statistics.
total_anomalies = sum(anomaly_counts.values())
most_anomalous_month = max(anomaly_counts.keys(), key=lambda x: anomaly_counts[x])
least_anomalous_month = min(anomaly_counts.keys(), key=lambda x: anomaly_counts[x])
print(f"\n🔍 MONTHLY RAINFALL ANOMALY ANALYSIS")
print("=" * 60)
print(f"📊 Total Anomalous Months: {total_anomalies}")
print(f"📈 Most Anomalous Month: {most_anomalous_month} ({anomaly_counts[most_anomalous_month]} years)")
print(f"📉 Least Anomalous Month: {least_anomalous_month} ({anomaly_counts[least_anomalous_month]} years)")
print(f"⚡ Anomaly Threshold: ±{anomaly_threshold} standard deviations")
print(f"\n🎯 MOST EXTREME EVENTS BY MONTH")
print("-" * 60)
for month in monthly_columns:
    if month in extreme_years:
        extreme = extreme_years[month]
        print(f"{month}: {extreme['year']} ({extreme['rainfall']:.1f}mm, σ={extreme['anomaly_score']:.2f})")
    else:
        print(f"{month}: No anomalies detected")

# Decade-wise anomaly totals.
anomaly_data['DECADE'] = (anomaly_data['YEAR'] // 10) * 10
# IMPROVEMENT: vectorized groupby-sum over the flag columns, replacing the
# original groupby('DECADE').apply(lambda ...) — that pattern operates over
# the grouping column and emits a deprecation warning in pandas >= 2.2.
anomaly_flag_cols = [f'{month}_is_anomaly' for month in monthly_columns]
decade_anomalies = (
    anomaly_data.groupby('DECADE')[anomaly_flag_cols].sum().sum(axis=1)
    .rename('ANOMALY_COUNT')
    .reset_index()
)
print(f"\n📅 ANOMALIES BY DECADE")
print("-" * 60)
for _, row in decade_anomalies.iterrows():
    decade_label = f"{int(row['DECADE'])}s"
    print(f"{decade_label}: {int(row['ANOMALY_COUNT'])} anomalous months")
🔍 MONTHLY RAINFALL ANOMALY ANALYSIS ============================================================ 📊 Total Anomalous Months: 158 📈 Most Anomalous Month: FEB (17 years) 📉 Least Anomalous Month: DEC (9 years) ⚡ Anomaly Threshold: ±1.5 standard deviations 🎯 MOST EXTREME EVENTS BY MONTH ------------------------------------------------------------ JAN: 1943 (58.5mm, σ=3.88) FEB: 1937 (53.8mm, σ=2.64) MAR: 1967 (63.3mm, σ=2.85) APR: 2015 (69.4mm, σ=3.01) MAY: 1990 (114.5mm, σ=3.34) JUN: 1938 (275.5mm, σ=3.01) JUL: 2002 (138.9mm, σ=-3.70) AUG: 1926 (335.5mm, σ=2.20) SEP: 1917 (281.0mm, σ=2.96) OCT: 1917 (158.8mm, σ=2.94) NOV: 1979 (74.2mm, σ=2.79) DEC: 1967 (54.4mm, σ=4.49) 📅 ANOMALIES BY DECADE ------------------------------------------------------------ 1900s: 21 anomalous months 1910s: 20 anomalous months 1920s: 17 anomalous months 1930s: 13 anomalous months 1940s: 13 anomalous months 1950s: 17 anomalous months 1960s: 9 anomalous months 1970s: 13 anomalous months 1980s: 7 anomalous months 1990s: 10 anomalous months 2000s: 10 anomalous months 2010s: 8 anomalous months
In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# ---- K-means clustering of years by seasonal rainfall profile ----
seasonal_features = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec', 'ANNUAL']
rainfall_features = rainfall_data[seasonal_features]

# Standardize so each feature contributes equally to Euclidean distance.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(rainfall_features)
assert np.isfinite(scaled_features).all(), "Missing/infinite values detected after scaling"

# Sweep candidate k values, recording inertia (for the elbow) and
# silhouette scores (for cluster cohesion).
inertias = []
silhouettes = []
k_range = range(2, 8)
for k in k_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_assignments = kmeans.fit_predict(scaled_features)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(scaled_features, cluster_assignments))

# Automated elbow detection: pick the k where the inertia curve bends most
# (largest squared second difference); +2 because k_range starts at 2.
elbow_k = np.argmax(np.diff(np.diff(inertias))**2) + 2
optimal_k = max(3, min(elbow_k, 6))  # constrain between 3-6 clusters

# Final clustering with the chosen k.
final_kmeans = KMeans(n_clusters=optimal_k, n_init=20, random_state=42)
rainfall_data['Rainfall_Cluster'] = final_kmeans.fit_predict(scaled_features)

# Name clusters by wetness, ranking centroids on their (un-scaled)
# annual-rainfall coordinate.
# BUG FIX: the original computed sorted_indices but never used it, mapping
# raw cluster index -> name (so cluster 0 was always 'Severe Dry' regardless
# of its actual rainfall level), and its 5-name list would raise IndexError
# whenever optimal_k == 6 (allowed by the clamp above).
centroids = scaler.inverse_transform(final_kmeans.cluster_centers_)
sorted_indices = np.argsort(centroids[:, -1])  # driest centroid first
category_names = ['Severe Dry', 'Dry', 'Normal', 'Wet', 'Extreme Wet', 'Exceptional Wet']
cluster_labels = {int(cluster_idx): category_names[rank]
                  for rank, cluster_idx in enumerate(sorted_indices)}
rainfall_data['Rainfall_Category'] = rainfall_data['Rainfall_Cluster'].map(cluster_labels)

# Scatter of annual rainfall per year, colored by cluster category.
fig = px.scatter(
    rainfall_data,
    x='YEAR',
    y='ANNUAL',
    color='Rainfall_Category',
    title=f'Rainfall Pattern Clustering (Optimal K={optimal_k})',
    labels={'YEAR': 'Year', 'ANNUAL': 'Annual Rainfall (mm)'},
    hover_data=seasonal_features,
    # trendline='lowess',
    # trendline_color_override='black'
)

# Mark each cluster's centroid at the median year of its member points.
# BUG FIX: the original zipped alphabetically-ordered category medians with
# index-ordered centroids, mispairing marker x positions and heights.
for cluster_idx in range(optimal_k):
    member_years = rainfall_data.loc[rainfall_data['Rainfall_Cluster'] == cluster_idx, 'YEAR']
    fig.add_trace(go.Scatter(
        x=[member_years.median()],
        y=[centroids[cluster_idx, -1]],
        mode='markers',
        marker=dict(size=12, line=dict(width=2)),
        name=f'Cluster {cluster_idx+1} Center'
    ))
fig.update_layout(
    template='plotly_white',
    height=700,
    legend_title_text='Rainfall Category',
    xaxis_rangeslider_visible=True
)
# Optional exports kept for reference:
# fig.write_html('rainfall_cluster_analysis.html')
# rainfall_data.to_csv('clustered_rainfall_data.csv', index=False)
fig.show()
In [6]:
import pandas as pd
import numpy as np
# use: pip install prophet
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly

# ---- Prophet forecast of annual rainfall ----
# Prophet expects a two-column frame: ds (datetime) and y (value).
# Anchor each year's annual total at Dec 31 of that year.
rainfall_data['DATE'] = pd.to_datetime(rainfall_data['YEAR'].astype(str) + '-12-31')
annual_rainfall_ts = rainfall_data.set_index('DATE')['ANNUAL']
prophet_data = annual_rainfall_ts.reset_index()
prophet_data.columns = ['ds', 'y']

# changepoint_prior_scale=0.1 keeps the fitted trend moderately flexible;
# interval_width=0.8 yields 80% uncertainty bands.
prophet_model = Prophet(
    yearly_seasonality=True,
    changepoint_prior_scale=0.1,
    interval_width=0.8
)
prophet_model.fit(prophet_data)

# 20 future periods at year-end frequency -> forecast through 2035.
future = prophet_model.make_future_dataframe(periods=20, freq='YE')
forecast = prophet_model.predict(future)

# Interactive forecast plot.
# FIX: the original title read '...(1901-2035)<sub>:...' with no <br>, so
# the subtitle rendered inline after the title (every other figure in this
# notebook uses '<br><sub>'); added the break and dropped the stray colon.
fig_forecast = plot_plotly(prophet_model, forecast)
fig_forecast.update_layout(
    title='Annual Rainfall Forecast Using Prophet (1901-2035)<br><sub>20-year projection with confidence intervals based on historical patterns</sub>',
    xaxis_title='Year',
    yaxis_title='Annual Rainfall (mm)',
    # template='plotly_white',
    height=900,
    width=1500
)
fig_forecast.show()

# Trend and seasonality components fitted by the model.
fig_components = plot_components_plotly(prophet_model, forecast)
fig_components.update_layout(
    title='Prophet Model Components<br><sub>Trend and seasonal patterns in rainfall data</sub>',
    template='plotly_white',
    height=500
)
fig_components.show()

# Compare the historical mean with the mean of the forecast-only period.
forecast_period = forecast[forecast['ds'] > prophet_data['ds'].max()]
historical_mean = prophet_data['y'].mean()
forecast_mean = forecast_period['yhat'].mean()
print(f"Historical Average (1901-2015): {historical_mean:.1f} mm")
print(f"Forecast Average (2016-2035): {forecast_mean:.1f} mm")
print(f"Projected Change: {forecast_mean - historical_mean:+.1f} mm ({((forecast_mean/historical_mean-1)*100):+.1f}%)")
10:39:36 - cmdstanpy - INFO - Chain [1] start processing 10:39:36 - cmdstanpy - INFO - Chain [1] done processing
Historical Average (1901-2015): 1182.0 mm Forecast Average (2016-2035): 1104.7 mm Projected Change: -77.3 mm (-6.5%)